import copy
import csv
import json
import os

import pymongo
from pymongo import MongoClient
from sklearn.model_selection import train_test_split
# Connection to the MongoDB server on port 27017 (host left at pymongo's
# default) and a handle on the 'tiktok' database used by the rest of the script.
client = MongoClient(port=27017)
db = client['tiktok']

# Template for the documents stored per video.  It is assembled from its
# sub-sections so the nesting is easier to read.  Within this script only
# `_id`, `text_feature.text` and `text_feature.stickerText` are filled in;
# the remaining fields are inserted empty (presumably populated by a later
# processing step -- confirm against the consumers of this database).
_text_feature = {
    'text': '',
    'stickerText': [],
}

_video_feature = {
    'text_embed': [],
    'img_embed': {},
    'audio': {'yamnet': []},
    'editing': [],
    'label': {},
    'residual': {},
}

obj = {
    '_id': '',
    'text_feature': _text_feature,
    'video_feature': _video_feature,
    'img_feature': {},
}
# Names of the collections that already exist in the database when the script
# starts.
pastht = db.list_collection_names()

# Second column of every row of infos.tsv, kept as a list.  The csv module
# asks for files to be opened with newline='' so that it can do its own
# line-ending handling.
with open('D:\\Work\\Tool\\tiktok\\all\\infos.tsv', 'r', encoding='utf-8',
          newline='') as filename_input:
    reader = csv.reader(filename_input, delimiter='\t')
    # Blank or truncated rows have no second column; skip them instead of
    # aborting the whole run with an IndexError.
    labels = [row[1] for row in reader if len(row) > 1]
# Directory holding the per-hashtag JSON-lines dumps, and the root directory
# under which the downloaded videos live as <hashtag>\<video id>.mp4.
TIKTOK_DIR = 'D:\\Work\\Tool\\tiktok\\TikToks\\'
VIDEO_DIR = 'F:\\Tiktok\\Hashtag\\'
# At most this many successfully parsed lines are considered per file.
MAX_RECORDS_PER_FILE = 2000


def _sticker_texts(record):
    """Return the sticker texts found in one decoded JSON record.

    Two layouts are handled: a top-level 'stickerTextList' (only list-valued
    'stickerText' entries are collected) or, failing that, a top-level
    'stickersOnItem' (list values are flattened, other values are kept as-is).
    """
    collected = []
    if 'stickerTextList' in record:
        for item in record['stickerTextList']:
            if isinstance(item['stickerText'], list):
                collected.extend(item['stickerText'])
    elif 'stickersOnItem' in record:
        for item in record['stickersOnItem']:
            # The key has to be looked up on the sticker entry itself;
            # testing the enclosing record would either drop every sticker
            # or fail with a KeyError on entries without the key.
            if 'stickerText' in item:
                value = item['stickerText']
                if isinstance(value, list):
                    collected.extend(value)
                else:
                    collected.append(value)
    return collected


def _parse_record(record):
    """Return ``(video_id, text, sticker_texts)`` for one decoded record.

    Records carry the id/description either at the top level ('id'/'desc')
    or nested under 'itemInfos' ('id'/'text').  Records with neither layout
    yield an empty id, empty text and no sticker texts.
    """
    if 'id' in record:
        return record['id'], record['desc'], _sticker_texts(record)
    if 'itemInfos' in record:
        infos = record['itemInfos']
        # NOTE(review): sticker keys are looked up on the top-level record
        # for this layout as well -- confirm they are not nested in itemInfos.
        return infos['id'], infos['text'], _sticker_texts(record)
    return '', '', []


# Set copy of the id column so the per-record membership test is O(1).
known_ids = set(labels)

for fname in os.listdir(TIKTOK_DIR):
    # Skip files whose name contains the name of a collection that already
    # existed at start-up.
    # NOTE(review): this is a substring match, so a short collection name can
    # also skip files of longer hashtags -- confirm this is intended.
    if any(pht in fname for pht in pastht):
        continue
    if not fname.endswith('.json'):
        continue

    # The hashtag is the second '_'-separated component of the file name.
    name_parts = fname.split('_')
    if len(name_parts) < 2:
        print('skip (no hashtag in file name)', fname)
        continue
    hashtag = name_parts[1]
    collection = db[hashtag]

    with open(TIKTOK_DIR + fname, 'r', encoding='utf-8', newline='\n') as filename_input:
        ranking = 0
        for line in filename_input:
            try:
                z = json.loads(line)
            except ValueError:
                # Not valid JSON (json.JSONDecodeError is a ValueError).
                continue
            if not isinstance(z, dict):
                # A bare JSON scalar/array cannot be a video record.
                continue

            video_id, text, stext = _parse_record(z)

            # Every parsed line counts towards the per-file limit, whether or
            # not it ends up being inserted.
            ranking += 1
            if ranking > MAX_RECORDS_PER_FILE:
                break

            # Cheapest checks first: known id, then video file on disk, then
            # a database round trip to see whether the document exists.
            if video_id not in known_ids:
                continue
            if not os.path.exists(VIDEO_DIR + hashtag + '\\' + video_id + '.mp4'):
                continue
            if collection.count_documents({'_id': video_id}, limit=1) != 0:
                continue

            # Work on a private copy so the shared template is never mutated.
            temp = copy.deepcopy(obj)
            temp['_id'] = video_id
            temp['text_feature']['text'] = text
            temp['text_feature']['stickerText'] = stext
            try:
                collection.insert_one(temp)
            except pymongo.errors.DuplicateKeyError:
                print(video_id, 'duplicate')
            else:
                print('insert successful', hashtag, video_id)